Youtube Comments Sentiment Analysis
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import nltk
import os, re, csv
import inspect
import nltk.corpus
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.cm as cm
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\X\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\X\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package vader_lexicon to [nltk_data] C:\Users\X\AppData\Roaming\nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
merged_comments = pd.read_excel('./merged_comments.xlsx')
channel_topics = pd.read_excel('./yt_channels_topics.xlsx')
print(channel_topics)
Channels Assad \
0 right-wing: NaN
1 Rubin Report NaN
2 Ben Shapiro https://www.youtube.com/watch?v=lCh0Sti3Ons
3 Steven Crowder https://www.youtube.com/watch?v=EzUkoCRGNnc
4 Paul Joseph Watson https://www.youtube.com/watch?v=kmlrt_6MwXY
5 The Hill NaN
6 centrist: NaN
7 Last Week Tonight NaN
8 Democracy Now https://www.youtube.com/watch?v=SHHoCFyHzco
9 David Pakman NaN
10 Young Turks https://www.youtube.com/watch?v=89CpGuc73qc
11 left-wing: NaN
12 Second Thought NaN
13 Vaush https://www.youtube.com/watch?v=72lgT4ku4C4
14 Majority Report https://www.youtube.com/watch?v=d12ohmPAwzo
15 F.D. Signifier NaN
16 news channels NaN
17 CNN https://www.youtube.com/watch?v=6R1JXjN05A8
18 BBC https://www.youtube.com/watch?v=vtGswyr3OwQ
19 Fox News https://www.youtube.com/watch?v=9B_NOEe_4L0
Health CEO murder \
0 NaN
1 https://www.youtube.com/watch?v=8XB2gjykF3M
2 https://www.youtube.com/watch?v=Ecwxq9fDiQU
3 NaN
4 NaN
5 https://www.youtube.com/watch?v=GSCSHMy52Jk
6 NaN
7 NaN
8 https://www.youtube.com/watch?v=jyKJKUnMBY4
9 NaN
10 https://www.youtube.com/watch?v=Mlmq1rsNqg8
11 NaN
12 NaN
13 https://www.youtube.com/watch?v=MhYWCXNXvbE
14 https://www.youtube.com/watch?v=95hy5UE9sDk
15 NaN
16 NaN
17 https://www.youtube.com/watch?v=fmmwF0hfi4I
18 https://www.youtube.com/watch?v=ZioSHAcn6gM
19 https://www.youtube.com/watch?v=nuFe2gOe_rc
Johnny Depp accused of violence \
0 NaN
1 NaN
2 https://www.youtube.com/watch?v=bYpvVQnZvDU
3 https://www.youtube.com/watch?v=vi6pvZcQ5AQ
4 NaN
5 https://www.youtube.com/watch?v=q-1KD2JWjdc
6 NaN
7 NaN
8 NaN
9 https://www.youtube.com/watch?v=GCjb-D-j2B8
10 https://www.youtube.com/watch?v=p3vf67ChfN0
11 NaN
12 NaN
13 https://www.youtube.com/watch?v=7SvYofqcdy4
14 https://www.youtube.com/watch?v=ZuYVCDzhEAk
15 https://www.youtube.com/watch?v=bblB5FtbnkU
16 NaN
17 https://www.youtube.com/watch?v=4LlO5sEpcF4
18 https://www.youtube.com/watch?v=L8S-YWXGyv8
19 https://www.youtube.com/watch?v=NWGTyS195og
Capitol
0 NaN
1 https://www.youtube.com/watch?v=QOXs-zT5v64
2 https://www.youtube.com/watch?v=m8ZfmyxmgTA
3 https://www.youtube.com/watch?v=6H_9ZY_Jabs
4 NaN
5 NaN
6 NaN
7 https://www.youtube.com/watch?v=0ri_Ma4DBks
8 https://www.youtube.com/watch?v=F1sY2fQ5pnY
9 https://www.youtube.com/watch?v=-NdutkXGc0U
10 https://www.youtube.com/watch?v=gn4-LGIWMbE
11 NaN
12 https://www.youtube.com/watch?v=FoP9ufM3bjw
13 https://www.youtube.com/watch?v=FoP9ufM3bjw
14 https://www.youtube.com/watch?v=69opVkXYq2A
15 NaN
16 NaN
17 https://www.youtube.com/watch?v=JLBLgH3PAeI
18 https://www.youtube.com/watch?v=UXR_bqyAy4E
19 https://www.youtube.com/watch?v=tVPSYr-xG6s
merged_comments.head()
| User | Comment | Likes | Published At | Time Ago | channel | topic | leaning | |
|---|---|---|---|---|---|---|---|---|
| 0 | @joseaugustofigueiredo2796 | BBC journalists must go to the doors of Britis... | 0 | 2024-12-17 11:29:42 | 2024-12-17 11:29:42 | bbc | assad | news |
| 1 | @AzadBus-h1h | Getting out of hands. It may sound off topic b... | 0 | 2024-12-15 21:38:09 | 2024-12-15 21:38:09 | bbc | assad | news |
| 2 | @Emmanuel-n7y | Imperialist go home | 0 | 2024-12-14 08:39:23 | 2024-12-14 08:39:23 | bbc | assad | news |
| 3 | @mrb6309 | It's like we've seen this movie in the Middle ... | 0 | 2024-12-13 05:23:05 | 2024-12-13 05:23:05 | bbc | assad | news |
| 4 | @musadube1647 | 😂😂😂 how dumb and foolish the Americans and the... | 0 | 2024-12-13 01:29:48 | 2024-12-13 01:29:48 | bbc | assad | news |
Cleaning dataset
In this step we perform operations of:
- cleaning text data
- dropping inessential columns
- removing NaN rows
- splitting data into separate sets
merged_comments = merged_comments.drop(['User', 'Published At', 'Time Ago'], axis=1)
print(merged_comments.head())
Comment Likes channel topic \ 0 BBC journalists must go to the doors of Britis... 0 bbc assad 1 Getting out of hands. It may sound off topic b... 0 bbc assad 2 Imperialist go home 0 bbc assad 3 It's like we've seen this movie in the Middle ... 0 bbc assad 4 😂😂😂 how dumb and foolish the Americans and the... 0 bbc assad leaning 0 news 1 news 2 news 3 news 4 news
merged_comments = merged_comments.dropna()
print(len(merged_comments))
113266
Split dataset into multiple subsets based on the topic
#unique values in the column as a vector or whatever for encoders later
uniqs = merged_comments['topic'].unique()
uniqs
array(['assad', 'ceo', 'depp', 'capitol'], dtype=object)
def splitter(channel: pd.DataFrame) -> pd.DataFrame:
channel_comments = merged_comments.loc[merged_comments['topic'] == f'{channel}']
channel_comments = channel_comments.drop(['topic'], axis=1)
return channel_comments
list_of_names = {}
assad_comments = splitter('assad')
ceo_comments = splitter('ceo')
capitol_comments = splitter('capitol')
depp_comments = splitter('depp')
print(assad_comments.head())
print(assad_comments.dtypes)
print(depp_comments.head())
Comment Likes channel leaning
0 BBC journalists must go to the doors of Britis... 0 bbc news
1 Getting out of hands. It may sound off topic b... 0 bbc news
2 Imperialist go home 0 bbc news
3 It's like we've seen this movie in the Middle ... 0 bbc news
4 😂😂😂 how dumb and foolish the Americans and the... 0 bbc news
Comment object
Likes int64
channel object
leaning object
dtype: object
Comment Likes channel leaning
5268 Absolutely disgusting so called "journalism". ... 0 cnn news
5269 Anyone who respects left-wing media should wat... 0 cnn news
5270 CNN reporting is biased. Salute to USA justice... 0 cnn news
5271 Amber Tu*d,was a se* worker,had affa*rs, with ... 0 cnn news
5272 Believe all women, every day, all day and twic... 0 cnn news
# replace special characters from text.
# substituting "/", "@" and "|" and others by a space.
list_of_names = [assad_comments, depp_comments, capitol_comments, ceo_comments]
def cleaner(comments_clean: list):
for x in comments_clean:
x = x.replace('[^a-zA-Z0-9 \n\.]', '', regex=True)
#x = x.str.lower()
cleaner(list_of_names)
assad_comments['Comment'] = assad_comments['Comment'].str.lower()
ceo_comments['Comment'] = ceo_comments['Comment'].str.lower()
capitol_comments['Comment'] = capitol_comments['Comment'].str.lower()
depp_comments['Comment'] = depp_comments['Comment'].str.lower()
print(ceo_comments.head())
Comment Likes channel leaning 3237 well he was stressed out so of course he went ... 0 cnn news 3238 police were being forceful with him . luig... 0 cnn news 3239 nullify nullify \nfree luigi 0 cnn news 3240 i am sad!! wish him well!! 0 cnn news 3241 hope we can save luigi. 0 cnn news
ceo_comments['Likes']\
.unique()
array([ 0, 2, 1, 3, 9, 4, 8, 5, 73, 22, 15,
18, 10, 130, 53, 137, 14, 36, 153, 16, 19, 6,
24, 281, 7, 43, 155, 12, 28, 198, 21, 42, 320,
72, 31, 535, 224, 68, 25, 141, 62, 11, 181, 30,
26, 215, 270, 63, 128, 790, 35, 87, 27, 227, 147,
196, 52, 1184, 66, 34, 355, 47, 69, 203, 616, 20,
622, 100, 856, 326, 286, 225, 45, 161, 29, 273, 347,
67, 156, 105, 195, 38, 104, 48, 604, 127, 50, 13,
2308, 117, 138, 480, 2260, 318, 39, 970, 76, 316, 3191,
1898, 264, 660, 851, 385, 167, 282, 525, 748, 556, 37,
208, 77, 23, 92, 44, 80, 65, 184, 640, 157, 1847,
378, 32, 268, 499, 179, 1013, 862, 2695, 103, 135, 345,
54, 114, 115, 375, 432, 1985, 51, 1000, 3507, 1949, 91,
17, 457, 79, 794, 182, 1117, 221, 1850, 96, 4073, 82,
46, 1012, 5671, 6776, 168, 232, 2844, 504, 86, 671, 106,
4531, 40, 887, 491, 610, 160, 3169, 2128, 680, 315, 2990,
251, 58, 172, 354, 708, 163, 124, 166, 183, 1314, 2061,
56, 41, 112, 289, 1823, 98, 756, 59, 330, 263, 121,
170, 154, 88, 236, 416, 442, 1157, 55, 219, 186, 430,
536, 512, 911, 538, 995, 691, 209, 341, 99, 444, 193,
142, 242, 61, 880, 204, 258, 4077, 1283, 864, 1833, 335,
544, 279, 33, 148, 211, 85, 171, 1466, 144, 905, 60,
713, 169, 420, 394, 107, 74, 71, 238, 146, 361, 125,
132, 377, 70, 49, 417, 90, 628, 415, 118, 64, 126,
119, 75, 129, 212, 331, 253, 1003, 244, 97, 323, 434,
108, 590, 145, 191, 1407, 222, 1215, 207, 402, 687, 1121,
541, 476, 406, 197, 311, 89, 486, 110, 228, 349, 175,
1275, 470, 436, 654, 1245, 152, 809, 123, 245, 1839, 159,
358, 408, 81, 220, 122, 241, 120, 93], dtype=int64)
Text Clustering
Clustering Stage consists of dividing text data into Vectors and Clusters. We perform this operations for each comments section.
tfidf = TfidfVectorizer(
min_df = 5,
max_df = 0.95,
max_features = 8000,
stop_words = 'english'
)
tfidf.fit(assad_comments['Comment'])
text_Assad = tfidf.transform(assad_comments['Comment'])
text_ceo = tfidf.transform(ceo_comments['Comment'])
text_capitol = tfidf.transform(capitol_comments['Comment'])
text_depp = tfidf.transform(depp_comments['Comment'])
def find_optimal_clusters(data, max_k):
iters = range(2, max_k+1, 2)
sse = []
for k in iters:
sse.append(MiniBatchKMeans(n_clusters=k, init_size=1024, batch_size=2048, random_state=20).fit(data).inertia_)
print('Fit {} clusters'.format(k))
f, ax = plt.subplots(1, 1)
ax.plot(iters, sse, marker='o')
ax.set_xlabel('Cluster Centers')
ax.set_xticks(iters)
ax.set_xticklabels(iters)
ax.set_ylabel('SSE')
ax.set_title(f'SSE by Cluster Center Plot of set')
find_optimal_clusters(text_Assad, 20)
find_optimal_clusters(text_ceo, 20)
find_optimal_clusters(text_capitol, 20)
find_optimal_clusters(text_depp, 20)
C:\Users\X\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1934: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=3)
Fit 2 clusters Fit 4 clusters Fit 6 clusters Fit 8 clusters Fit 10 clusters Fit 12 clusters Fit 14 clusters Fit 16 clusters Fit 18 clusters Fit 20 clusters Fit 2 clusters Fit 4 clusters Fit 6 clusters Fit 8 clusters Fit 10 clusters Fit 12 clusters Fit 14 clusters Fit 16 clusters Fit 18 clusters Fit 20 clusters Fit 2 clusters Fit 4 clusters Fit 6 clusters Fit 8 clusters Fit 10 clusters Fit 12 clusters Fit 14 clusters Fit 16 clusters Fit 18 clusters Fit 20 clusters Fit 2 clusters Fit 4 clusters Fit 6 clusters Fit 8 clusters Fit 10 clusters Fit 12 clusters Fit 14 clusters Fit 16 clusters Fit 18 clusters Fit 20 clusters
# set k to 6
clusters_assad = MiniBatchKMeans(n_clusters=6, init_size=1024, batch_size=2048, random_state=20).fit_predict(text_Assad)
clusters_ceo =MiniBatchKMeans(n_clusters=6, init_size=1024, batch_size=2048, random_state=20).fit_predict(text_ceo)
clusters_capitol =MiniBatchKMeans(n_clusters=6, init_size=1024, batch_size=2048, random_state=20).fit_predict(text_capitol)
clusters_depp =MiniBatchKMeans(n_clusters=6, init_size=1024, batch_size=2048, random_state=20).fit_predict(text_depp)
#probably overrated
def get_top_keywords(data, clusters, labels, n_terms):
df = pd.DataFrame(data.todense()).groupby(clusters).mean()
for i,r in df.iterrows():
print('\nCluster {}'.format(i))
print(','.join([labels[t] for t in np.argsort(r)[-n_terms:]]))
get_top_keywords(text_Assad, clusters_assad, tfidf.get_feature_names_out(), 15)
get_top_keywords(text_ceo, clusters_ceo, tfidf.get_feature_names_out(), 15)
get_top_keywords(text_capitol, clusters_capitol, tfidf.get_feature_names_out(), 15)
get_top_keywords(text_depp, clusters_depp, tfidf.get_feature_names_out(), 15)
Cluster 0 fan,fanatical,going,christians,say,need,live,jews,rid,earth,born,cancer,peacefully,raised,islam Cluster 1 intervention,attacks,actions,opinion,sending,exchange,demands,urge,assistance,approved,crown,prince,minorities,iran,israel Cluster 2 fascism,fascist,fast,fat,fate,fanatics,fairly,like,don,coming,greater,learn,isreal,arabs,mistakes Cluster 3 new,russia,christians,good,rebels,going,syrian,country,home,just,people,like,israel,assad,syria Cluster 4 families,family,fan,fanatical,fanatics,far,farts,fascism,fascist,familiar,zone,does,end,times,seen Cluster 5 taking,future,religious,ethnic,opportunity,sides,idk,optimistic,tolerance,expand,thier,pass,im,known,isnt Cluster 0 hear,issue,understanding,class,seriously,press,cheap,sucks,truth,probably,shouting,wasn,people,like,screaming Cluster 1 know,media,murder,left,think,man,rich,right,ceo,don,guy,just,like,ben,people Cluster 2 regards,college,earlier,right,great,got,sure,family,understand,talking,school,masters,degree,comes,kid Cluster 3 sorry,dead,ceo,health,justice,shame,police,coverage,company,mr,alive,particular,reported,sympathy,individual Cluster 4 fanatical,fanatics,familiar,country,don,freedom,love,anymore,shame,people,defense,uncle,way,feel,like Cluster 5 industry,company,pay,denied,united,profit,free,ceo,people,companies,care,health,luigi,insurance,healthcare Cluster 0 family,fan,fanatical,fanatics,far,farts,fascism,fascist,false,just,media,president,forget,wil,swamp Cluster 1 finally,just,million,simple,work,visit,honestly,amazing,think,great,comment,thank,video,thought,second Cluster 2 police,american,video,don,country,right,biden,ben,election,just,supporters,like,america,people,trump Cluster 3 like,ben,law,feel,political,shapiro,reality,bases,fast,totally,facts,views,holds,beliefs,ideals Cluster 4 families,family,fan,fanatical,fanatics,far,farts,fascism,help,needs,lies,makes,stuff,attention,mental Cluster 5 look,day,broke,peaceful,mean,know,right,deceived,don,like,people,group,trump,dressed,supporters Cluster 0 takes,woman,person,didnt,disgusting,public,eye,insane,school,ive,arent,genuinely,just,doing,annoying Cluster 1 man,abused,people,did,like,just,case,men,evidence,abuse,woman,cnn,women,heard,trial Cluster 2 fast,fascist,fascism,family,far,farts,fanatics,fanatical,zone,wannabe,married,time,doing,crazy,job Cluster 3 families,family,fan,fanatical,fanatics,far,farts,fascism,false,zone,analysis,channel,going,foot,disgusting Cluster 4 family,fan,fanatical,fanatics,far,farts,fascism,fascist,fast,families,american,bs,high,lady,bar Cluster 5 failure,fanatics,fanatical,far,like,going,time,happy,cnn,supports,media,mainstream,ah,news,fox
Sentiment Analysis
Analysis of perception by comments.
capitol_comments['Likes'] = ['0-10' if i < 10 else '11-100' if i < 100 else '101-500' if i < 500 else '501-1000' for i in capitol_comments['Likes']]
depp_comments['Likes'] = ['0-10' if i < 10 else '11-100' if i < 100 else '101-500' if i < 500 else '501-1000' for i in depp_comments['Likes']]
ceo_comments['Likes'] = ['0-10' if i < 10 else '11-100' if i < 100 else '101-500' if i < 500 else '501-1000' for i in ceo_comments['Likes']]
assad_comments['Likes'] = ['0-10' if i < 10 else '11-100' if i < 100 else '101-500' if i < 500 else '501-1000' for i in assad_comments['Likes']]
depp_comments['Likes'].hist()
<Axes: >
ceo_comments['Likes'].hist()
<Axes: >
assad_comments['Likes'].hist()
<Axes: >
Number of Comments by size performed by overall group and separated channels
capitol_comments['Likes'].hist()
<Axes: >
assad_comments.head()
| Comment | Likes | channel | leaning | |
|---|---|---|---|---|
| 0 | bbc journalists must go to the doors of britis... | 0-10 | bbc | news |
| 1 | getting out of hands. it may sound off topic b... | 0-10 | bbc | news |
| 2 | imperialist go home | 0-10 | bbc | news |
| 3 | it's like we've seen this movie in the middle ... | 0-10 | bbc | news |
| 4 | 😂😂😂 how dumb and foolish the americans and the... | 0-10 | bbc | news |
analyser = SentimentIntensityAnalyzer()
depp_comments.groupby(['leaning', 'Likes', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Johnny Depp Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
capitol_comments.groupby(['leaning', 'Likes', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Capitol Attack Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
ceo_comments.groupby(['leaning', 'Likes', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='CEO Assasination Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
assad_comments.groupby(['leaning', 'Likes', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Assad Fall Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
<Axes: title={'center': 'Assad Fall Topic distribution'}, xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments'>
depp_comments.groupby(['leaning', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Johnny Depp Leaning against Channel Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
capitol_comments.groupby(['leaning', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Capitol Attack Leaning against Channel Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
ceo_comments.groupby(['leaning', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='CEO Assasination Leaning against Channel Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
assad_comments.groupby(['leaning', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Assad Fall Leaning against Channel Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
<Axes: title={'center': 'Assad Fall Leaning against Channel Topic distribution'}, xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments'>
NLP Polarization Topic Modeling
Polarization of opinion by media using polarity score. It captures distribution of interest in form of numbers in range of -1 to 1 where -1 stands for purely negative position, 0 for neutral position and 1 for purely positive position.
def calculate_polarity_scores(data: pd.DataFrame):
compval1 = [ ] # empty list to hold our computed 'compound' VADER scores
i=0 # counter
while (i<len(data)):
k = analyser.polarity_scores(data.iloc[i]['Comment'])
compval1.append(k['compound'])
i = i+1
# convert sentiment values to numpy for easier usage
compval1 = np.array(compval1)
print(len(compval1))
print(compval1[0])
data['Polarity_scores'] = compval1
calculate_polarity_scores(assad_comments)
calculate_polarity_scores(ceo_comments)
calculate_polarity_scores(capitol_comments)
calculate_polarity_scores(depp_comments)
10949 -0.8217 11022 0.1779 64683 -0.6983 26612 -0.8858
ceo_comments
| Comment | Likes | channel | leaning | Polarity_scores | |
|---|---|---|---|---|---|
| 3237 | well he was stressed out so of course he went ... | 0-10 | cnn | news | 0.1779 |
| 3238 | police were being forceful with him . luig... | 0-10 | cnn | news | -0.4545 |
| 3239 | nullify nullify \nfree luigi | 0-10 | cnn | news | 0.5106 |
| 3240 | i am sad!! wish him well!! | 0-10 | cnn | news | 0.4344 |
| 3241 | hope we can save luigi. | 0-10 | cnn | news | 0.7269 |
| ... | ... | ... | ... | ... | ... |
| 108094 | his hoodie was a mid tone hue that's why it lo... | 0-10 | young_turks | centre | 0.0000 |
| 108095 | _“those who make peaceful revolution impossibl... | 11-100 | young_turks | centre | -0.5106 |
| 108096 | so young, so handsome, and an ivy league compu... | 0-10 | young_turks | centre | -0.1416 |
| 108097 | i'm tellin' y'all, that's not luigi! that's su... | 11-100 | young_turks | centre | 0.0000 |
| 108098 | the nypd said this guy is black😂 | 0-10 | young_turks | centre | 0.0000 |
11022 rows × 5 columns
# assign score categories and logic
global i
def predict_sentiment(data: pd.DataFrame):
i = 0
predicted_value = [ ] # empty series to hold our predicted values
while(i<len(data)):
if ((data.iloc[i]['Polarity_scores'] >= 0.7)):
predicted_value.append('positive')
i = i+1
elif ((data.iloc[i]['Polarity_scores'] > 0) & (data.iloc[i]['Polarity_scores'] < 0.7)):
predicted_value.append('neutral')
i = i+1
elif ((data.iloc[i]['Polarity_scores'] <= 0)):
predicted_value.append('negative')
i = i+1
data['predicted sentiment'] = predicted_value
predict_sentiment(assad_comments)
predict_sentiment(ceo_comments)
predict_sentiment(capitol_comments)
predict_sentiment(depp_comments)
ceo_comments
| Comment | Likes | channel | leaning | Polarity_scores | predicted sentiment | |
|---|---|---|---|---|---|---|
| 3237 | well he was stressed out so of course he went ... | 0-10 | cnn | news | 0.1779 | neutral |
| 3238 | police were being forceful with him . luig... | 0-10 | cnn | news | -0.4545 | negative |
| 3239 | nullify nullify \nfree luigi | 0-10 | cnn | news | 0.5106 | neutral |
| 3240 | i am sad!! wish him well!! | 0-10 | cnn | news | 0.4344 | neutral |
| 3241 | hope we can save luigi. | 0-10 | cnn | news | 0.7269 | positive |
| ... | ... | ... | ... | ... | ... | ... |
| 108094 | his hoodie was a mid tone hue that's why it lo... | 0-10 | young_turks | centre | 0.0000 | negative |
| 108095 | _“those who make peaceful revolution impossibl... | 11-100 | young_turks | centre | -0.5106 | negative |
| 108096 | so young, so handsome, and an ivy league compu... | 0-10 | young_turks | centre | -0.1416 | negative |
| 108097 | i'm tellin' y'all, that's not luigi! that's su... | 11-100 | young_turks | centre | 0.0000 | negative |
| 108098 | the nypd said this guy is black😂 | 0-10 | young_turks | centre | 0.0000 | negative |
11022 rows × 6 columns
Distribution of Sentiment Type by CEO Assassination Topic
The opinions remain mostly negative with over 65% of negative response.
ceo_plot = ceo_comments.groupby('predicted sentiment').size().plot(kind='bar')
ceo_plot.set_title('CEO Sentiment Type')
Text(0.5, 1.0, 'CEO Sentiment Type')
Distribution of Sentiment Type by Capitol Attack Topic
The opinions remain negative with over 60% of negative response.
capitol_plot = capitol_comments.groupby('predicted sentiment').size().plot(kind='bar')
capitol_plot.set_title('Capitol Sentiment Type')
Text(0.5, 1.0, 'Capitol Sentiment Type')
Distribution of Sentiment Type by Assad Fall Topic
The opinions remain mostly negative/neutral with 50% of negative response.
assad_plot = assad_comments.groupby('predicted sentiment').size().plot(kind='bar')
assad_plot.set_title('Assad Sentiment Type')
Text(0.5, 1.0, 'Assad Sentiment Type')
Distribution of Sentiment Type by Johnny Depp Trial Topic
The opinions remain negative with over 60% of negative response.
depp_plot = depp_comments.groupby('predicted sentiment').size().plot(kind='bar')
depp_plot.set_title('Depp Sentiment Type')
Text(0.5, 1.0, 'Depp Sentiment Type')
from wordcloud import WordCloud,STOPWORDS
Distribution of Sentiment by channel and sentiment leaning
The opinions show high variance on different channels among responses.
depp_comments.groupby(['predicted sentiment', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Johnny Depp Leaning against Channel Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
ceo_comments.groupby(['predicted sentiment', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='CEO Leaning against Channel Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
capitol_comments.groupby(['predicted sentiment', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Capitol Attack Leaning against Channel Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
assad_comments.groupby(['predicted sentiment', 'channel']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Assad Fall Leaning against Channel Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
<Axes: title={'center': 'Assad Fall Leaning against Channel Topic distribution'}, xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments'>
def get_top_keywords(data, sentiment):
stop_words = set(stopwords.words('english'))
comments = data[data['predicted sentiment'] == sentiment]['Comment']
all_words = ' '.join(comments).lower()
words = word_tokenize(all_words)
filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
word_counts = Counter(filtered_words)
top_keywords = word_counts.most_common(40)
return top_keywords
channels = ['depp_comments', 'ceo_comments', 'capitol_comments', 'assad_comments']
sentiments = ['negative', 'neutral', 'positive']
for channel in channels:
data = globals()[channel]
print(f"Top 40 keywords for {channel}:")
for sentiment in sentiments:
keywords = get_top_keywords(data, sentiment)
print(f"\n{sentiment.capitalize()} comments:")
for word, count in keywords:
print(f"{word}: {count}")
Top 40 keywords for depp_comments: Negative comments: amber: 5382 women: 3637 heard: 3488 depp: 3031 johnny: 2906 trial: 2798 abuse: 2686 people: 2607 woman: 2345 men: 2302 evidence: 2247 like: 2183 one: 2077 case: 2062 cnn: 1869 get: 1813 even: 1809 would: 1802 victim: 1532 man: 1516 believe: 1512 abused: 1368 think: 1340 know: 1311 victims: 1283 never: 1228 abuser: 1211 see: 1151 got: 1148 time: 1143 watch: 1140 lies: 1101 also: 1083 right: 999 back: 994 really: 989 said: 985 say: 981 jury: 977 lied: 962 Neutral comments: amber: 1112 like: 1044 depp: 795 johnny: 787 women: 743 trial: 723 people: 717 heard: 716 get: 539 would: 517 woman: 515 cnn: 492 one: 486 men: 483 case: 474 think: 406 evidence: 397 even: 382 man: 355 watch: 332 see: 329 never: 329 truth: 327 know: 322 want: 312 good: 305 time: 298 right: 285 really: 278 video: 275 believe: 273 marriage: 272 lol: 267 got: 266 media: 262 jury: 261 justice: 255 way: 253 said: 251 court: 247 Positive comments: like: 797 people: 644 amber: 580 johnny: 564 depp: 502 women: 491 love: 447 men: 432 would: 426 heard: 415 marriage: 409 get: 399 one: 388 woman: 360 trial: 359 case: 359 man: 345 think: 314 know: 313 good: 307 truth: 306 even: 267 see: 260 also: 257 win: 255 really: 254 want: 241 never: 239 time: 236 evidence: 233 believe: 222 god: 219 justice: 219 life: 216 make: 210 way: 195 need: 195 much: 190 could: 185 say: 184 Top 40 keywords for ceo_comments: Negative comments: people: 1649 insurance: 1075 healthcare: 970 get: 655 health: 638 ceo: 637 like: 636 system: 577 ben: 548 one: 538 us: 494 would: 491 care: 466 companies: 460 right: 450 money: 387 make: 386 murder: 385 guy: 385 man: 374 even: 367 know: 362 luigi: 351 many: 351 think: 348 time: 315 life: 314 media: 308 left: 301 need: 293 pay: 293 denied: 288 never: 279 back: 279 violence: 277 wrong: 273 going: 268 medical: 265 american: 263 way: 258 Neutral comments: people: 555 healthcare: 437 like: 432 insurance: 389 care: 382 health: 381 luigi: 275 get: 225 free: 220 would: 217 system: 210 us: 209 ben: 197 one: 182 ceo: 177 rich: 160 companies: 160 money: 156 right: 156 united: 141 think: 136 need: 125 guy: 124 time: 123 make: 122 want: 119 know: 118 pay: 118 going: 113 medical: 112 see: 112 many: 111 trump: 110 good: 108 never: 104 left: 102 much: 100 way: 100 go: 97 even: 97 Positive comments: care: 367 people: 333 health: 311 like: 264 healthcare: 232 insurance: 200 us: 174 rich: 141 get: 138 system: 137 one: 136 good: 134 would: 128 right: 126 free: 119 money: 112 united: 109 profit: 104 think: 104 ben: 98 need: 96 love: 93 even: 90 companies: 90 make: 88 luigi: 86 ceo: 86 know: 84 also: 80 life: 79 government: 79 better: 77 country: 77 pay: 75 time: 73 man: 73 best: 73 left: 73 go: 73 way: 72 Top 40 keywords for capitol_comments: Negative comments: trump: 9331 people: 8818 like: 3115 america: 3067 election: 2778 us: 2743 one: 2733 would: 2664 country: 2575 right: 2529 get: 2522 ben: 2428 antifa: 2413 biden: 2213 blm: 2161 capitol: 2035 see: 1963 police: 1919 even: 1910 think: 1903 going: 1781 president: 1779 american: 1772 time: 1754 know: 1719 left: 1706 media: 1696 violence: 1624 say: 1610 supporters: 1602 government: 1578 go: 1571 democracy: 1482 never: 1468 way: 1466 world: 1465 news: 1421 fraud: 1402 need: 1378 video: 1365 Neutral comments: trump: 3351 like: 2548 people: 2479 america: 1284 would: 1141 supporters: 1004 country: 993 us: 968 ben: 901 one: 853 good: 808 get: 796 video: 795 antifa: 762 right: 754 biden: 747 see: 707 think: 683 world: 657 want: 642 peaceful: 626 american: 621 well: 620 capitol: 614 party: 598 know: 578 election: 577 police: 573 time: 571 blm: 571 lol: 570 even: 563 democracy: 562 president: 560 great: 544 going: 536 left: 523 go: 509 never: 495 really: 489 Positive comments: trump: 1769 people: 1611 like: 1409 ha: 1343 party: 714 america: 691 would: 684 us: 668 good: 647 one: 615 great: 608 right: 587 supporters: 580 country: 579 video: 514 get: 509 think: 500 love: 495 biden: 461 god: 452 world: 442 election: 439 ben: 432 president: 429 well: 424 need: 419 even: 413 american: 406 see: 399 know: 399 make: 382 left: 368 say: 367 time: 365 really: 357 way: 356 want: 355 going: 341 take: 334 states: 333 Top 40 keywords for assad_comments: Negative comments: syria: 1258 assad: 834 people: 687 israel: 622 us: 574 war: 450 back: 429 go: 403 country: 392 syrian: 386 like: 382 going: 349 rebels: 342 russia: 341 one: 323 another: 322 get: 305 world: 289 new: 285 trump: 272 terrorist: 266 regime: 255 middle: 243 christians: 240 time: 239 years: 236 would: 235 see: 231 iran: 228 isis: 222 even: 219 know: 218 east: 217 terrorists: 212 never: 207 west: 202 syrians: 200 bad: 196 home: 193 think: 189 Neutral comments: syria: 606 like: 505 israel: 310 assad: 306 people: 288 go: 220 back: 203 good: 193 us: 193 syrian: 191 country: 189 one: 144 new: 139 ben: 135 rebels: 131 get: 130 going: 129 christians: 127 another: 124 would: 124 home: 116 know: 114 want: 107 russia: 107 god: 106 better: 105 middle: 104 world: 102 hope: 102 east: 97 take: 95 well: 94 looks: 93 great: 92 never: 92 regime: 91 thank: 89 time: 87 years: 83 isis: 83 Positive comments: syria: 360 people: 290 like: 219 assad: 166 israel: 155 country: 134 god: 133 free: 131 syrian: 131 good: 126 peace: 117 hope: 111 us: 110 one: 108 go: 93 great: 85 happy: 83 would: 83 freedom: 81 better: 81 world: 80 back: 76 see: 76 president: 75 christians: 72 time: 71 get: 71 years: 71 well: 71 jesus: 71 ben: 71 new: 70 love: 70 let: 69 regime: 67 going: 67 much: 64 also: 61 even: 61 many: 60
import pandas as pd
import random
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the VADER lexicon and stopwords
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
# Initialize the VADER sentiment analyzer
analyser = SentimentIntensityAnalyzer()
# Function to get top 50 unique keywords for each sentiment
def get_unique_keywords(data, sentiment):
stop_words = set(stopwords.words('english'))
comments = data[data['predicted sentiment'] == sentiment]['Comment']
all_words = ' '.join(comments).lower()
words = word_tokenize(all_words)
filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
word_counts = Counter(filtered_words)
top_keywords = word_counts.most_common(50)
return set([word for word, count in top_keywords])
# Function to get unique keywords for each sentiment
def get_sentiment_keywords(data):
negative_keywords = get_unique_keywords(data, 'negative')
neutral_keywords = get_unique_keywords(data, 'neutral')
positive_keywords = get_unique_keywords(data, 'positive')
unique_negative = negative_keywords - neutral_keywords - positive_keywords
unique_neutral = neutral_keywords - negative_keywords - positive_keywords
unique_positive = positive_keywords - negative_keywords - neutral_keywords
return unique_negative, unique_neutral, unique_positive
# Assuming your data is in DataFrames called depp_comments, ceo_comments, capitol_comments, and assad_comments
channels = ['depp_comments', 'ceo_comments', 'capitol_comments', 'assad_comments']
for channel in channels:
data = globals()[channel]
unique_negative, unique_neutral, unique_positive = get_sentiment_keywords(data)
print(f"Top 50 unique keywords for {channel}:")
print(f"\nNegative comments: {list(unique_negative)[:50]}")
print(f"\nNeutral comments: {list(unique_neutral)[:50]}")
print(f"\nPositive comments: {list(unique_positive)[:50]}")
[nltk_data] Downloading package vader_lexicon to [nltk_data] C:\Users\X\AppData\Roaming\nltk_data... [nltk_data] Package vader_lexicon is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\X\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\X\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
Top 50 unique keywords for depp_comments: Negative comments: ['go', 'abused', 'going', 'victim', 'lying', 'victims', 'wrong', 'abuse', 'domestic', 'lies', 'lied', 'abuser', 'still'] Neutral comments: ['video', 'lol', 'money', 'thank'] Positive comments: ['life', 'married', 'god', 'great', 'always', 'years', 'need'] Top 50 unique keywords for ceo_comments: Negative comments: ['company', 'murder', 'take', 'person', 'someone', 'pain', 'back', 'denied', 'wrong', 'violence', 'bad'] Neutral comments: ['americans', 'trump'] Positive comments: ['really', 'government', 'great', 'country', 'best', 'love'] Top 50 unique keywords for capitol_comments: Negative comments: ['democrats', 'war', 'years', 'many', 'let', 'capital', 'news', 'violence', 'fraud', 'evidence', 'day'] Neutral comments: ['lol', 'peaceful'] Positive comments: ['ha', 'also', 'god', 'republican', 'states', 'better', 'freedom'] Top 50 unique keywords for assad_comments: Negative comments: ['terrorists', 'dictator', 'islamic', 'war', 'next', 'libya', 'al', 'bad', 'iran', 'terrorist'] Neutral comments: ['refugees', 'looks', 'help', 'turkey', 'want'] Positive comments: ['much', 'also', 'president', 'many', 'let', 'jesus', 'best', 'love', 'peace', 'happy', 'said', 'freedom']
depp_comments.groupby(['predicted sentiment', 'leaning']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Johnny Depp Leaning against sentiment Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
ceo_comments.groupby(['predicted sentiment', 'leaning']).size().unstack().plot(kind='bar',figsize=(11, 5), title='CEO Leaning against sentiment Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
capitol_comments.groupby(['predicted sentiment', 'leaning']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Capitol Attack Leaning against sentiment Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
assad_comments.groupby(['predicted sentiment', 'leaning']).size().unstack().plot(kind='bar',figsize=(11, 5), title='Assad Fall Leaning against sentiment Topic distribution', xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments')
<Axes: title={'center': 'Assad Fall Leaning against sentiment Topic distribution'}, xlabel='Distribution of interest based on media, political spectrum and Likes', ylabel='Number of Comments'>
import sklearn
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
def clean_comment(comment, bigrams=False):
comment = comment.lower() # lower case
comment = re.sub('['+my_punctuation + ']+', ' ', comment) # strip punctuation
comment = re.sub('\s+', ' ', comment) #remove double spacing
comment = re.sub('([0-9]+)', '', comment) # remove numbers
comment_token_list = [word for word in comment.split(' ')
if word not in my_stopwords] # remove stopwords
comment_token_list = [word_rooter(word) if '#' not in word else word
for word in comment_token_list] # apply word rooter
if bigrams:
comment_token_list = comment_token_list+[comment_token_list[i]+'_'+ comment_token_list[i+1]
for i in range(len(comment_token_list)-1)]
comment = ' '.join(comment_token_list)
return comment
depp_comments['clean_comment'] = depp_comments.Comment.apply(clean_comment)
ceo_comments['clean_comment'] = ceo_comments.Comment.apply(clean_comment)
capitol_comments['clean_comment'] = capitol_comments.Comment.apply(clean_comment)
assad_comments['clean_comment'] = assad_comments.Comment.apply(clean_comment)
assad_comments.head()
| Comment | Likes | channel | leaning | Polarity_scores | predicted sentiment | clean_comment | |
|---|---|---|---|---|---|---|---|
| 0 | bbc journalists must go to the doors of britis... | 0-10 | bbc | news | -0.8217 | negative | bbc journalist must go door british nato base ... |
| 1 | getting out of hands. it may sound off topic b... | 0-10 | bbc | news | 0.9235 | positive | get hand may sound topic chanc cool translat e... |
| 2 | imperialist go home | 0-10 | bbc | news | 0.0000 | negative | imperialist go home |
| 3 | it's like we've seen this movie in the middle ... | 0-10 | bbc | news | 0.7783 | positive | like seen movi middl east play let hope coher ... |
| 4 | 😂😂😂 how dumb and foolish the americans and the... | 0-10 | bbc | news | -0.7184 | negative | 😂😂😂 dumb foolish american west trap like happe... |
from sklearn.feature_extraction.text import CountVectorizer
# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')
# apply transformation
tf_assad = vectorizer.fit_transform(assad_comments['clean_comment']).toarray()
# tf_feature_names tells us what word each column in the matric represents
tf_assad_feature_names = vectorizer.get_feature_names_out()
from sklearn.decomposition import LatentDirichletAllocation
# let us assume 10 topics
number_of_topics = 10
Topic Modeling by LDA algorithm
We are trying to figure out 10 most important topics.
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
# apply the model
model.fit(tf_assad)
LatentDirichletAllocation(random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LatentDirichletAllocation(random_state=0)
def display_topics(model, feature_names, no_top_words):
topic_dict = {}
for topic_idx, topic in enumerate(model.components_):
topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
for i in topic.argsort()[:-no_top_words - 1:-1]]
topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
for i in topic.argsort()[:-no_top_words - 1:-1]]
return pd.DataFrame(topic_dict)
Topics weights based on multiple words
The topic weights show how common and impactful are the occurrences of different words.
no_top_words = 10
# apply transformation
tf_depp = vectorizer.fit_transform(depp_comments['clean_comment']).toarray()
# tf_feature_names tells us what word each column in the matric represents
tf_depp_feature_names = vectorizer.get_feature_names_out()
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
# apply the model
model.fit(tf_depp)
no_top_words = 10
display_topics(model, tf_depp_feature_names, no_top_words)
| Topic 0 words | Topic 0 weights | Topic 1 words | Topic 1 weights | Topic 2 words | Topic 2 weights | Topic 3 words | Topic 3 weights | Topic 4 words | Topic 4 weights | Topic 5 words | Topic 5 weights | Topic 6 words | Topic 6 weights | Topic 7 words | Topic 7 weights | Topic 8 words | Topic 8 weights | Topic 9 words | Topic 9 weights | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | like | 1820.6 | men | 1369.4 | ’ | 3105.6 | cnn | 2537.6 | ah | 797.9 | abus | 5678.5 | depp | 2436.3 | evid | 1942.1 | watch | 946.5 | get | 1103.4 |
| 1 | peopl | 842.9 | women | 1342.1 | ’t | 2885.1 | news | 936.3 | jd | 670.1 | amber | 3093.9 | johnni | 1913.5 | trial | 1839.9 | trial | 795.1 | go | 920.7 |
| 2 | know | 822.4 | marriag | 1177.7 | it | 986.1 | lie | 712.1 | women | 491.5 | victim | 2510.3 | heard | 1278.7 | juri | 1211.6 | video | 790.8 | money | 843.7 |
| 3 | love | 697.8 | get | 1090.6 | i | 930.9 | watch | 700.9 | elain | 477.5 | heard | 2125.3 | men | 1128.5 | case | 1129.2 | peopl | 429.6 | hit | 686.6 |
| 4 | shit | 651.7 | peopl | 782.5 | don | 753.0 | media | 658.2 | accus | 320.9 | lie | 1892.8 | win | 1057.3 | watch | 1100.8 | lol | 429.4 | pay | 683.7 |
| 5 | amber | 633.7 | man | 781.8 | ” | 736.3 | amber | 649.0 | fals | 290.3 | women | 1857.0 | amber | 1017.8 | amber | 1020.1 | like | 414.4 | johnni | 659.1 |
| 6 | realli | 631.8 | marri | 767.8 | … | 725.6 | truth | 439.7 | lie | 279.3 | believ | 1560.7 | women | 957.3 | lie | 939.1 | talk | 401.8 | want | 627.9 |
| 7 | thing | 553.9 | woman | 721.4 | ’m | 594.1 | trial | 436.3 | amber | 252.6 | johnni | 1147.4 | case | 822.5 | uk | 913.9 | thank | 352.2 | would | 616.5 |
| 8 | guy | 523.8 | want | 652.3 | didn | 593.1 | liar | 388.7 | need | 210.7 | depp | 1022.2 | abus | 804.7 | judg | 755.5 | care | 329.6 | time | 563.2 |
| 9 | think | 482.6 | abus | 597.0 | she | 563.0 | fake | 313.6 | woman | 204.7 | evid | 953.4 | victim | 682.8 | bbc | 709.3 | comment | 303.4 | got | 559.7 |
# apply transformation
tf_ceo = vectorizer.fit_transform(ceo_comments['clean_comment']).toarray()
# tf_feature_names tells us what word each column in the matric represents
tf_ceo_feature_names = vectorizer.get_feature_names_out()
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
# apply the model
model.fit(tf_ceo)
no_top_words = 10
display_topics(model, tf_ceo_feature_names, no_top_words)
| Topic 0 words | Topic 0 weights | Topic 1 words | Topic 1 weights | Topic 2 words | Topic 2 weights | Topic 3 words | Topic 3 weights | Topic 4 words | Topic 4 weights | Topic 5 words | Topic 5 weights | Topic 6 words | Topic 6 weights | Topic 7 words | Topic 7 weights | Topic 8 words | Topic 8 weights | Topic 9 words | Topic 9 weights | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | healthcar | 751.8 | ’t | 406.5 | insur | 1537.4 | ’ | 1162.7 | ceo | 719.5 | trump | 347.4 | ben | 659.5 | luigi | 731.7 | care | 355.4 | like | 765.7 |
| 1 | system | 576.8 | i | 310.1 | compani | 885.1 | ’t | 687.6 | murder | 548.8 | xd | 280.5 | right | 587.5 | free | 425.4 | american | 257.7 | look | 404.6 |
| 2 | peopl | 537.8 | … | 227.5 | health | 884.9 | it | 432.8 | peopl | 492.5 | guy | 164.0 | left | 484.0 | mangion | 229.2 | us | 236.7 | guy | 294.7 |
| 3 | countri | 288.2 | ’m | 218.1 | care | 722.7 | don | 294.3 | kill | 388.3 | peopl | 159.8 | peopl | 261.4 | back | 205.8 | health | 229.8 | say | 234.7 |
| 4 | us | 251.3 | guy | 153.0 | healthcar | 636.9 | peopl | 175.5 | man | 310.4 | stop | 133.7 | shapiro | 204.2 | go | 188.5 | work | 210.8 | show | 229.7 |
| 5 | chang | 219.4 | ’r | 152.8 | deni | 555.3 | he | 163.5 | famili | 284.9 | think | 118.6 | issu | 168.8 | pain | 141.5 | peopl | 205.6 | one | 227.8 |
| 6 | corrupt | 214.3 | know | 126.1 | pay | 534.8 | that | 161.5 | mani | 219.9 | tri | 117.6 | us | 152.4 | get | 135.5 | make | 188.9 | thing | 197.5 |
| 7 | govern | 209.0 | don | 124.7 | medic | 461.2 | insur | 141.8 | one | 219.3 | u | 116.8 | rich | 151.3 | mental | 113.5 | money | 177.5 | know | 196.3 |
| 8 | america | 193.1 | ” | 111.6 | peopl | 450.6 | care | 132.9 | get | 202.2 | want | 114.8 | class | 150.3 | peopl | 94.4 | system | 171.1 | make | 184.2 |
| 9 | american | 190.5 | 😂 | 111.4 | get | 375.8 | compani | 121.0 | death | 190.8 | report | 105.8 | media | 148.3 | black | 84.4 | healthcar | 132.1 | would | 173.6 |
# apply transformation
tf_capitol = vectorizer.fit_transform(capitol_comments['clean_comment']).toarray()
# tf_feature_names tells us what word each column in the matric represents
tf_capitol_feature_names = vectorizer.get_feature_names_out()
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
# apply the model
model.fit(tf_capitol)
no_top_words = 10
display_topics(model, tf_capitol_feature_names, no_top_words)
| Topic 0 words | Topic 0 weights | Topic 1 words | Topic 1 weights | Topic 2 words | Topic 2 weights | Topic 3 words | Topic 3 weights | Topic 4 words | Topic 4 weights | Topic 5 words | Topic 5 weights | Topic 6 words | Topic 6 weights | Topic 7 words | Topic 7 weights | Topic 8 words | Topic 8 weights | Topic 9 words | Topic 9 weights | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | elect | 3398.3 | protest | 2858.8 | video | 2425.4 | trump | 6286.5 | peopl | 3626.2 | ’ | 3445.7 | america | 3119.9 | news | 1673.0 | evid | 1597.3 | day | 1591.7 |
| 1 | vote | 2985.2 | peopl | 1954.9 | right | 1525.1 | support | 4135.7 | media | 1130.7 | ’t | 3296.0 | trump | 2855.0 | war | 1194.3 | trump | 1399.0 | ha | 1407.1 |
| 2 | peopl | 2442.7 | polic | 1865.0 | parti | 1453.5 | antifa | 3591.6 | countri | 1090.1 | trump | 2310.0 | world | 2208.7 | civil | 883.7 | elect | 1342.5 | countri | 1402.8 |
| 3 | biden | 2241.0 | white | 1625.6 | like | 1342.9 | riot | 2228.2 | right | 1072.2 | ben | 2069.3 | us | 1841.4 | fox | 869.7 | fraud | 1196.9 | like | 1261.6 |
| 4 | democrat | 1435.6 | black | 1326.3 | xd | 988.1 | blm | 2098.1 | american | 1037.2 | peopl | 1769.0 | countri | 1783.2 | year | 850.6 | state | 1102.5 | watch | 1118.1 |
| 5 | trump | 1407.6 | peac | 1252.0 | flag | 933.1 | capitol | 1876.4 | take | 959.2 | it | 1361.1 | presid | 1563.6 | start | 791.4 | court | 951.9 | democraci | 1098.3 |
| 6 | republican | 1180.0 | terrorist | 1219.0 | one | 801.0 | build | 1501.2 | left | 958.7 | like | 1310.5 | american | 1424.2 | go | 741.2 | coup | 947.5 | america | 1044.3 |
| 7 | would | 1114.0 | call | 1128.2 | good | 781.5 | peopl | 1296.8 | govern | 957.7 | i | 1291.1 | god | 1315.4 | see | 708.8 | ben | 705.7 | american | 863.2 |
| 8 | presid | 1113.7 | would | 1047.9 | thought | 730.9 | burn | 1178.5 | back | 915.7 | don | 1178.1 | usa | 1275.2 | fake | 658.7 | lie | 676.4 | worst | 814.5 |
| 9 | go | 1073.8 | blm | 1011.8 | left | 712.6 | protest | 1057.0 | work | 872.2 | say | 1100.5 | state | 1263.5 | get | 652.3 | bbc | 565.6 | lol | 700.3 |
Cloud Words for most mentioned topics
def generate_cloudwords(data: pd.DataFrame, sentiment: str):
ceo_negative = data[data['predicted sentiment']==f'{sentiment}']
words = ' '.join(ceo_negative['Comment'])
cleaned_word = " ".join([word for word in words.split()
if 'http' not in word
and not word.startswith('@')
and word != 'RT'
])
stopwords = set(STOPWORDS)
wordcloud = WordCloud(stopwords=stopwords,
background_color='black',
width=3000,
height=2500
).generate(cleaned_word)
plt.figure(1,figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.title(f'Cloudword for {sentiment} {data}')
plt.show()
generate_cloudwords(ceo_comments, 'negative')
generate_cloudwords(depp_comments, 'neutral')
generate_cloudwords(assad_comments, 'positive')